1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.lucene.analysis.cn.smart.hhmm;
19
20 import java.io.DataInputStream;
21 import java.io.IOException;
22 import java.io.InputStream;
23 import java.io.ObjectInputStream;
24 import java.io.ObjectOutputStream;
25 import java.nio.ByteBuffer;
26 import java.nio.ByteOrder;
27 import java.nio.file.Files;
28 import java.nio.file.Path;
29 import java.nio.file.Paths;
30
31 import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
32 import org.apache.lucene.analysis.cn.smart.Utility;
33
34
35
36
37
38 class WordDictionary extends AbstractDictionary {
39
40 private WordDictionary() {
41 }
42
43 private static WordDictionary singleInstance;
44
45
46
47
48 public static final int PRIME_INDEX_LENGTH = 12071;
49
50
51
52
53
54
55
56
57 private short[] wordIndexTable;
58
59 private char[] charIndexTable;
60
61
62
63
64
65
66
67
68
69
70 private char[][][] wordItem_charArrayTable;
71
72 private int[][] wordItem_frequencyTable;
73
74
75
76
77
78
79
80 public synchronized static WordDictionary getInstance() {
81 if (singleInstance == null) {
82 singleInstance = new WordDictionary();
83 try {
84 singleInstance.load();
85 } catch (IOException e) {
86 String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
87 singleInstance.load(wordDictRoot);
88 } catch (ClassNotFoundException e) {
89 throw new RuntimeException(e);
90 }
91 }
92 return singleInstance;
93 }
94
95
96
97
98
99
100 public void load(String dctFileRoot) {
101 String dctFilePath = dctFileRoot + "/coredict.dct";
102 Path serialObj = Paths.get(dctFileRoot + "/coredict.mem");
103
104 if (Files.exists(serialObj) && loadFromObj(serialObj)) {
105
106 } else {
107 try {
108 wordIndexTable = new short[PRIME_INDEX_LENGTH];
109 charIndexTable = new char[PRIME_INDEX_LENGTH];
110 for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
111 charIndexTable[i] = 0;
112 wordIndexTable[i] = -1;
113 }
114 wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
115 wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
116
117 loadMainDataFromFile(dctFilePath);
118 expandDelimiterData();
119 mergeSameWords();
120 sortEachItems();
121
122 } catch (IOException e) {
123 throw new RuntimeException(e.getMessage());
124 }
125
126 saveToObj(serialObj);
127 }
128
129 }
130
131
132
133
134
135
136 public void load() throws IOException, ClassNotFoundException {
137 InputStream input = this.getClass().getResourceAsStream("coredict.mem");
138 loadFromObjectInputStream(input);
139 }
140
141 private boolean loadFromObj(Path serialObj) {
142 try {
143 loadFromObjectInputStream(Files.newInputStream(serialObj));
144 return true;
145 } catch (Exception e) {
146 throw new RuntimeException(e);
147 }
148 }
149
150 private void loadFromObjectInputStream(InputStream serialObjectInputStream)
151 throws IOException, ClassNotFoundException {
152 ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
153 wordIndexTable = (short[]) input.readObject();
154 charIndexTable = (char[]) input.readObject();
155 wordItem_charArrayTable = (char[][][]) input.readObject();
156 wordItem_frequencyTable = (int[][]) input.readObject();
157
158 input.close();
159 }
160
161 private void saveToObj(Path serialObj) {
162 try {
163 ObjectOutputStream output = new ObjectOutputStream(Files.newOutputStream(
164 serialObj));
165 output.writeObject(wordIndexTable);
166 output.writeObject(charIndexTable);
167 output.writeObject(wordItem_charArrayTable);
168 output.writeObject(wordItem_frequencyTable);
169 output.close();
170
171 } catch (Exception e) {
172
173 }
174 }
175
176
177
178
179
180
181
182
183 private int loadMainDataFromFile(String dctFilePath) throws IOException {
184 int i, cnt, length, total = 0;
185
186
187 int[] buffer = new int[3];
188 byte[] intBuffer = new byte[4];
189 String tmpword;
190 DataInputStream dctFile = new DataInputStream(Files.newInputStream(Paths.get(dctFilePath)));
191
192
193 for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
194
195
196
197 dctFile.read(intBuffer);
198
199 cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
200 if (cnt <= 0) {
201 wordItem_charArrayTable[i] = null;
202 wordItem_frequencyTable[i] = null;
203 continue;
204 }
205 wordItem_charArrayTable[i] = new char[cnt][];
206 wordItem_frequencyTable[i] = new int[cnt];
207 total += cnt;
208 int j = 0;
209 while (j < cnt) {
210
211 dctFile.read(intBuffer);
212 buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
213 .getInt();
214 dctFile.read(intBuffer);
215 buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
216 .getInt();
217 dctFile.read(intBuffer);
218 buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
219 .getInt();
220
221
222 wordItem_frequencyTable[i][j] = buffer[0];
223
224 length = buffer[1];
225 if (length > 0) {
226 byte[] lchBuffer = new byte[length];
227 dctFile.read(lchBuffer);
228 tmpword = new String(lchBuffer, "GB2312");
229
230
231 wordItem_charArrayTable[i][j] = tmpword.toCharArray();
232 } else {
233
234 wordItem_charArrayTable[i][j] = null;
235 }
236
237 j++;
238 }
239
240 String str = getCCByGB2312Id(i);
241 setTableIndex(str.charAt(0), i);
242 }
243 dctFile.close();
244 return total;
245 }
246
247
248
249
250
251
252 private void expandDelimiterData() {
253 int i;
254 int cnt;
255
256
257 int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
258 i = 0;
259 while (i < wordItem_charArrayTable[delimiterIndex].length) {
260 char c = wordItem_charArrayTable[delimiterIndex][i][0];
261 int j = getGB2312Id(c);
262 if (wordItem_charArrayTable[j] == null) {
263
264 int k = i;
265
266 while (k < wordItem_charArrayTable[delimiterIndex].length
267 && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
268 k++;
269 }
270
271
272 cnt = k - i;
273 if (cnt != 0) {
274 wordItem_charArrayTable[j] = new char[cnt][];
275 wordItem_frequencyTable[j] = new int[cnt];
276 }
277
278
279 for (k = 0; k < cnt; k++, i++) {
280
281 wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
282 wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
283 System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
284 wordItem_charArrayTable[j][k], 0,
285 wordItem_charArrayTable[j][k].length);
286 }
287 setTableIndex(c, j);
288 }
289 }
290
291 wordItem_charArrayTable[delimiterIndex] = null;
292 wordItem_frequencyTable[delimiterIndex] = null;
293 }
294
295
296
297
298 private void mergeSameWords() {
299 int i;
300 for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
301 if (wordItem_charArrayTable[i] == null)
302 continue;
303 int len = 1;
304 for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
305 if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
306 wordItem_charArrayTable[i][j - 1], 0) != 0)
307 len++;
308
309 }
310 if (len < wordItem_charArrayTable[i].length) {
311 char[][] tempArray = new char[len][];
312 int[] tempFreq = new int[len];
313 int k = 0;
314 tempArray[0] = wordItem_charArrayTable[i][0];
315 tempFreq[0] = wordItem_frequencyTable[i][0];
316 for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
317 if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
318 tempArray[k], 0) != 0) {
319 k++;
320
321 tempArray[k] = wordItem_charArrayTable[i][j];
322 tempFreq[k] = wordItem_frequencyTable[i][j];
323 } else {
324
325 tempFreq[k] += wordItem_frequencyTable[i][j];
326 }
327 }
328
329 wordItem_charArrayTable[i] = tempArray;
330 wordItem_frequencyTable[i] = tempFreq;
331 }
332 }
333 }
334
335 private void sortEachItems() {
336 char[] tmpArray;
337 int tmpFreq;
338 for (int i = 0; i < wordItem_charArrayTable.length; i++) {
339 if (wordItem_charArrayTable[i] != null
340 && wordItem_charArrayTable[i].length > 1) {
341 for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
342 for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
343 if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
344 wordItem_charArrayTable[i][j2], 0) > 0) {
345 tmpArray = wordItem_charArrayTable[i][j];
346 tmpFreq = wordItem_frequencyTable[i][j];
347 wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
348 wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
349 wordItem_charArrayTable[i][j2] = tmpArray;
350 wordItem_frequencyTable[i][j2] = tmpFreq;
351 }
352 }
353 }
354 }
355 }
356 }
357
358
359
360
361
362 private boolean setTableIndex(char c, int j) {
363 int index = getAvaliableTableIndex(c);
364 if (index != -1) {
365 charIndexTable[index] = c;
366 wordIndexTable[index] = (short) j;
367 return true;
368 } else
369 return false;
370 }
371
372 private short getAvaliableTableIndex(char c) {
373 int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
374 int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
375 if (hash1 < 0)
376 hash1 = PRIME_INDEX_LENGTH + hash1;
377 if (hash2 < 0)
378 hash2 = PRIME_INDEX_LENGTH + hash2;
379 int index = hash1;
380 int i = 1;
381 while (charIndexTable[index] != 0 && charIndexTable[index] != c
382 && i < PRIME_INDEX_LENGTH) {
383 index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
384 i++;
385 }
386
387
388 if (i < PRIME_INDEX_LENGTH
389 && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
390 return (short) index;
391 } else
392 return -1;
393 }
394
395 private short getWordItemTableIndex(char c) {
396 int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
397 int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
398 if (hash1 < 0)
399 hash1 = PRIME_INDEX_LENGTH + hash1;
400 if (hash2 < 0)
401 hash2 = PRIME_INDEX_LENGTH + hash2;
402 int index = hash1;
403 int i = 1;
404 while (charIndexTable[index] != 0 && charIndexTable[index] != c
405 && i < PRIME_INDEX_LENGTH) {
406 index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
407 i++;
408 }
409
410 if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
411 return (short) index;
412 } else
413 return -1;
414 }
415
416
417
418
419
420
421
422
423
424
425
426 private int findInTable(short knownHashIndex, char[] charArray) {
427 if (charArray == null || charArray.length == 0)
428 return -1;
429
430 char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
431 int start = 0, end = items.length - 1;
432 int mid = (start + end) / 2, cmpResult;
433
434
435 while (start <= end) {
436 cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
437
438 if (cmpResult == 0)
439 return mid;
440 else if (cmpResult < 0)
441 start = mid + 1;
442 else if (cmpResult > 0)
443 end = mid - 1;
444
445 mid = (start + end) / 2;
446 }
447 return -1;
448 }
449
450
451
452
453
454
455
456
457 public int getPrefixMatch(char[] charArray) {
458 return getPrefixMatch(charArray, 0);
459 }
460
461
462
463
464
465
466
467
468
469 public int getPrefixMatch(char[] charArray, int knownStart) {
470 short index = getWordItemTableIndex(charArray[0]);
471 if (index == -1)
472 return -1;
473 char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
474 int start = knownStart, end = items.length - 1;
475
476 int mid = (start + end) / 2, cmpResult;
477
478
479 while (start <= end) {
480 cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
481 if (cmpResult == 0) {
482
483 while (mid >= 0
484 && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
485 mid--;
486 mid++;
487 return mid;
488 } else if (cmpResult < 0)
489 end = mid - 1;
490 else
491 start = mid + 1;
492 mid = (start + end) / 2;
493 }
494 return -1;
495 }
496
497
498
499
500
501
502
503 public int getFrequency(char[] charArray) {
504 short hashIndex = getWordItemTableIndex(charArray[0]);
505 if (hashIndex == -1)
506 return 0;
507 int itemIndex = findInTable(hashIndex, charArray);
508 if (itemIndex != -1)
509 return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
510 return 0;
511
512 }
513
514
515
516
517
518
519
520
521 public boolean isEqual(char[] charArray, int itemIndex) {
522 short hashIndex = getWordItemTableIndex(charArray[0]);
523 return Utility.compareArray(charArray, 1,
524 wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
525 }
526
527 }